IDENTIFY MACRO-AREAS OR MARKET FIELDS within the database to gain a general awareness of its content

# Attach the tidyverse packages used throughout the notebook.
library(tidyverse)

# Load the data subset from its relative path.
data <- "../../data/data_subset1.csv" %>%
  read_csv()

# List the column names of the loaded table.
colnames(data)
library(skimr)
# Optional column-by-column summary of `data`; the call is left disabled.
#skim(data)

Wordcloud

Codice da: Using tidytext to make word clouds (richpauloo.github.io)

library(dplyr) # for data wrangling
library(tidytext) # for NLP
library(stringr) # to deal with strings
library(wordcloud) # to render wordclouds
library(knitr) # for tables
library(DT) # for dynamic tables
library(tidyr)

# Build a one-column table (`word`) holding, for each row of `data`, the
# abstract and title joined by a single space.
# A direct rename replaces the superseded `gather()` round-trip, and
# `na.rm = TRUE` keeps missing abstracts/titles from being pasted in as the
# literal text "NA" (which would later show up as a token).
tidy_dat <- data %>%
  unite(col = "united", abstract, title, sep = " ", na.rm = TRUE) %>%
  select(word = united)

tidy_dat
library(SnowballC)

# Stemming is used because lemmatization (udpipe_annotate) would be too slow
# to run.

# One row per token of the combined abstract/title text.
words <- tidy_dat %>%
  unnest_tokens(word, word)

# Stemmed frequency table of every token (stop words still included).
tokens <- words %>%
  mutate(word = wordStem(word)) %>%
  dplyr::count(word, sort = TRUE)

data("stop_words")

# Distinct tokens that start with a digit.
nums <- words %>%
  filter(str_detect(word, "^[0-9]")) %>%
  distinct(word)

# Stop words and number-like tokens are removed BEFORE stemming: the stop-word
# list holds unstemmed forms, so matching it against stems misses many of them
# (e.g. "this" stems to "thi"). This is the same order the bigram section uses.
tokens_clean <- words %>%
  anti_join(stop_words, by = "word") %>%
  anti_join(nums, by = "word") %>%
  mutate(word = wordStem(word)) %>%
  dplyr::count(word, sort = TRUE)

tokens_clean %>% head(100)
# Eight-colour "Dark2" ColorBrewer palette for the cloud.
pal <- brewer.pal(n = 8, name = "Dark2")

# Draw at most 50 stems, sized by their count, without random ordering.
wordcloud(
  words = tokens_clean$word,
  freq = tokens_clean$n,
  max.words = 50,
  random.order = FALSE,
  colors = pal
)
# Corpus-specific stop words to drop from the word cloud.
# `tokens_clean` holds STEMMED words, so the list is stemmed as well;
# otherwise entries such as "comprising", "computer" or "plurality" could
# never match. Duplicates are removed and the "invetion" typo is corrected.
uni_sw <- data.frame(
  word = unique(SnowballC::wordStem(c(
    "claim", "based", "model", "data", "invention", "network",
    "methods", "dataset", "l3", "computer", "computing",
    "device", "plurality", "comprising", "word", "user",
    "comprises", "system", "image", "processor", "process",
    "sentence", "workload", "method"
  ))),
  stringsAsFactors = FALSE
)

# Remove the custom stop words from the cleaned token counts.
tokens_clean <- tokens_clean %>%
  anti_join(uni_sw, by = "word")
# Eight-colour "Dark2" ColorBrewer palette for the cloud.
pal <- brewer.pal(n = 8, name = "Dark2")

# Redraw the cloud (at most 50 stems, no random ordering) from the current
# contents of `tokens_clean`.
wordcloud(
  words = tokens_clean$word,
  freq = tokens_clean$n,
  max.words = 50,
  random.order = FALSE,
  colors = pal
)

Topic modeling

Codice da: Topic modeling | Text Mining with R (tidytextmining.com)

library(topicmodels)
# Preview `data` grouped by `filename`; the result is displayed, not stored.
group_by(data, filename)
# Per-document stem counts for the topic model: one row per
# (`filename`, stemmed `word`) with its frequency `n`.
# - Only the columns actually used are selected (`claims` was never read).
# - `na.rm = TRUE` keeps missing abstracts/titles from becoming "NA" tokens.
# - Stop words are removed BEFORE stemming, because the stop-word list holds
#   unstemmed forms that stems such as "thi" (from "this") no longer match.
data_word_count <- data %>%
  select(filename, abstract, title) %>%
  unite(col = "united", abstract, title, sep = " ", na.rm = TRUE) %>%
  unnest_tokens(word, united) %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word = wordStem(word)) %>%
  count(filename, word, sort = TRUE)

data_word_count %>% head(10)
# Cast the per-document counts into a document-term matrix: documents come
# from `filename`, terms from `word`, cell values from `n`.
data_dtm <- cast_dtm(
  data_word_count,
  document = filename,
  term = word,
  value = n
)

data_dtm
# Fit an LDA topic model with 16 topics on the document-term matrix; the seed
# is fixed through the `control` list.
data_lda <- data_dtm %>%
  LDA(k = 16, control = list(seed = 1234))
library(reshape2)

# Tidy the fitted model's "beta" matrix into a data frame.
ap_topics <- data_lda %>%
  tidy(matrix = "beta")

ap_topics
# Keep, for every topic, the 7 terms with the largest beta, ordered by topic
# and then by decreasing beta.
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  slice_max(order_by = beta, n = 7) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# One bar-chart panel per topic. `reorder_within()` together with
# `scale_y_reordered()` orders the terms inside each panel by beta.
ap_gg <- ap_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(x = beta, y = term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

ap_gg


# Spread the betas to one column per topic ("topic1", "topic2", ...), keep the
# terms whose beta exceeds 0.001 in topic 1 or topic 2, and add the log2 ratio
# of topic 2 to topic 1.
beta_wide <- pivot_wider(
  mutate(ap_topics, topic = paste0("topic", topic)),
  names_from = topic,
  values_from = beta
) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))

beta_wide

Bigrams

codice da Relationships between words: n-grams and correlations | Text Mining with R (tidytextmining.com)

# Tokenise each title into two-word sequences (bigrams) and keep only the
# bigram column. `title` is tokenised directly: uniting a single column did
# nothing but rename it.
# NOTE(review): only `title` is used here, whereas the word cloud and the topic
# model use abstract + title; confirm this is intended.
bigrams <- data %>%
  unnest_tokens(bigram, title, token = "ngrams", n = 2) %>%
  select(bigram)

bigrams
# Split each bigram into its two component words.
bigrams_separated <- separate(
  bigrams,
  bigram,
  into = c("word1", "word2"),
  sep = " "
)

# Drop bigrams where either word is a stop word or starts with a digit.
bigrams_filtered <- bigrams_separated %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !str_detect(word1, "^[0-9]"),
    !str_detect(word2, "^[0-9]")
  )

# Stem both words, then count each stemmed pair, most frequent first.
bigram_counts <- bigrams_filtered %>%
  mutate(
    word1 = wordStem(word1),
    word2 = wordStem(word2)
  ) %>%
  count(word1, word2, sort = TRUE)

bigram_counts
library(igraph)
library(ggraph)

# Build a graph from the stemmed bigrams that occur at least 500 times.
bigram_graph <- graph_from_data_frame(filter(bigram_counts, n >= 500))

# Fix the RNG seed before the plot (and its layout) is built.
set.seed(2017)

# Network plot: edges, node points and node labels on an empty theme.
gg <- bigram_graph %>%
  ggraph(layout = "fr") + # "fr" or "graphopt"
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), hjust = 1, vjust = 1) +
  theme_void()

gg

# igraph is attached again here (it is already loaded for the bigram graph).
library(igraph)
# NOTE(review): nothing from ggiraph is called in the visible code; confirm
# whether an interactive version of the plot was intended.
library(ggiraph)

# Display the bigram network plot again.
gg

Using CPC

codice sunburst da: Create Basic Sunburst Graphs with ggplot2 | by Yahia El Gamal | Optima . Blog | Medium

library(tidyverse)

# Load the CPC code/description lookup table from its relative path.
cpc_descr <- "../../data/CPC_descriptions.csv" %>%
  read_csv()
Rows: 259657 Columns: 8
-- Column specification ----------------------------------------------
Delimiter: ","
chr (8): code_1, code_3, code_4, code_full, descr_1, descr_3, desc...

i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the first five rows of the CPC lookup table and of the main data.
head(cpc_descr, 5)
head(data, 5)
# Preview the 3-character prefix of each row's `ipc_classes`. The prefix is
# computed per row: indexing `ipc_classes[1]` would repeat the first row's
# prefix on every row.
data %>%
  mutate(tmp = substr(ipc_classes, 1, 3))
# Attach CPC descriptions to each record: the lower-cased first four
# characters of `ipc_classes` form the key `code_4`, which is matched against
# the distinct (code_4, descr_1, descr_3, descr_4) rows of the lookup table.
# Records without a matching code are dropped by the inner join.
res <- data %>%
  mutate(code_4 = tolower(substr(ipc_classes, 1, 4))) %>%
  inner_join(
    distinct(select(cpc_descr, code_4, descr_1, descr_3, descr_4)),
    by = "code_4"
  )

head(res)
# Collapse `res` to one row per (filename, descr_1, descr_3, descr_4)
# combination with its row count `n`, most frequent first.
res <- count(res, filename, descr_1, descr_3, descr_4, sort = TRUE)

res
library(sunburstR)
# Replace "-" with "_" inside the three description levels so that it cannot
# be confused with the "-" separator used to join the levels below.
res <- res %>%
  mutate(across(
    c(descr_1, descr_3, descr_4),
    ~ gsub("-", "_", as.character(.x), fixed = TRUE)
  ))

# One row per "level1-level2-level3" path with the number of `res` rows that
# share it, most frequent first. `count(united)` returns an ungrouped table, so
# no leftover grouping is passed on to the plotting step.
dat <- res %>%
  unite(col = "united", descr_1, descr_3, descr_4, sep = "-") %>%
  count(united, sort = TRUE)

dat %>% head()
library(d3r)
  

# Build the sunburst widget from the path/count table, with labels shown and
# the breadcrumb trail enabled, then display it.
sb3 <- dat %>%
  sund2b(
    showLabels = TRUE,
    breadcrumbs = sund2bBreadcrumb(enabled = TRUE)
  )

sb3
---
title: "R Notebook"
output: html_notebook
---

IDENTIFY MACRO-AREAS OR MARKET FIELDS within the database to gain a general awareness of its content

```{r}
# Attach the tidyverse packages used throughout the notebook.
library(tidyverse)

# Load the data subset from its relative path.
data <- "../../data/data_subset1.csv" %>%
  read_csv()

# List the column names of the loaded table.
colnames(data)
```

```{r}
library(skimr)
# Optional column-by-column summary of `data`; the call is left disabled.
#skim(data)
```

## Wordcloud

Codice da: [Using tidytext to make word clouds (richpauloo.github.io)](https://richpauloo.github.io/2017-12-29-Using-tidytext-to-make-word-clouds/)

```{r}
library(dplyr) # for data wrangling
library(tidytext) # for NLP
library(stringr) # to deal with strings
library(wordcloud) # to render wordclouds
library(knitr) # for tables
library(DT) # for dynamic tables
library(tidyr)

# Build a one-column table (`word`) holding, for each row of `data`, the
# abstract and title joined by a single space.
# A direct rename replaces the superseded `gather()` round-trip, and
# `na.rm = TRUE` keeps missing abstracts/titles from being pasted in as the
# literal text "NA" (which would later show up as a token).
tidy_dat <- data %>%
  unite(col = "united", abstract, title, sep = " ", na.rm = TRUE) %>%
  select(word = united)

tidy_dat

```

```{r}
library(SnowballC)

# Stemming is used because lemmatization (udpipe_annotate) would be too slow
# to run.

# One row per token of the combined abstract/title text.
words <- tidy_dat %>%
  unnest_tokens(word, word)

# Stemmed frequency table of every token (stop words still included).
tokens <- words %>%
  mutate(word = wordStem(word)) %>%
  dplyr::count(word, sort = TRUE)

data("stop_words")

# Distinct tokens that start with a digit.
nums <- words %>%
  filter(str_detect(word, "^[0-9]")) %>%
  distinct(word)

# Stop words and number-like tokens are removed BEFORE stemming: the stop-word
# list holds unstemmed forms, so matching it against stems misses many of them
# (e.g. "this" stems to "thi"). This is the same order the bigram section uses.
tokens_clean <- words %>%
  anti_join(stop_words, by = "word") %>%
  anti_join(nums, by = "word") %>%
  mutate(word = wordStem(word)) %>%
  dplyr::count(word, sort = TRUE)

tokens_clean %>% head(100)

```

```{r}
# Eight-colour "Dark2" ColorBrewer palette for the cloud.
pal <- brewer.pal(n = 8, name = "Dark2")

# Draw at most 50 stems, sized by their count, without random ordering.
wordcloud(
  words = tokens_clean$word,
  freq = tokens_clean$n,
  max.words = 50,
  random.order = FALSE,
  colors = pal
)
```

```{r}
# Corpus-specific stop words to drop from the word cloud.
# `tokens_clean` holds STEMMED words, so the list is stemmed as well;
# otherwise entries such as "comprising", "computer" or "plurality" could
# never match. Duplicates are removed and the "invetion" typo is corrected.
uni_sw <- data.frame(
  word = unique(SnowballC::wordStem(c(
    "claim", "based", "model", "data", "invention", "network",
    "methods", "dataset", "l3", "computer", "computing",
    "device", "plurality", "comprising", "word", "user",
    "comprises", "system", "image", "processor", "process",
    "sentence", "workload", "method"
  ))),
  stringsAsFactors = FALSE
)

# Remove the custom stop words from the cleaned token counts.
tokens_clean <- tokens_clean %>%
  anti_join(uni_sw, by = "word")

```

```{r}
# Eight-colour "Dark2" ColorBrewer palette for the cloud.
pal <- brewer.pal(n = 8, name = "Dark2")

# Redraw the cloud (at most 50 stems, no random ordering) from the current
# contents of `tokens_clean`.
wordcloud(
  words = tokens_clean$word,
  freq = tokens_clean$n,
  max.words = 50,
  random.order = FALSE,
  colors = pal
)


```

## Topic modeling

Codice da: [Topic modeling \| Text Mining with R (tidytextmining.com)](https://www.tidytextmining.com/topicmodeling.html)

```{r}
library(topicmodels)
```

```{r}
# Preview `data` grouped by `filename`; the result is displayed, not stored.
group_by(data, filename)
```

```{r}
# Per-document stem counts for the topic model: one row per
# (`filename`, stemmed `word`) with its frequency `n`.
# - Only the columns actually used are selected (`claims` was never read).
# - `na.rm = TRUE` keeps missing abstracts/titles from becoming "NA" tokens.
# - Stop words are removed BEFORE stemming, because the stop-word list holds
#   unstemmed forms that stems such as "thi" (from "this") no longer match.
data_word_count <- data %>%
  select(filename, abstract, title) %>%
  unite(col = "united", abstract, title, sep = " ", na.rm = TRUE) %>%
  unnest_tokens(word, united) %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word = wordStem(word)) %>%
  count(filename, word, sort = TRUE)

data_word_count %>% head(10)
```

```{r}
# Cast the per-document counts into a document-term matrix: documents come
# from `filename`, terms from `word`, cell values from `n`.
data_dtm <- cast_dtm(
  data_word_count,
  document = filename,
  term = word,
  value = n
)

data_dtm
```

```{r}
# Fit an LDA topic model with 16 topics on the document-term matrix; the seed
# is fixed through the `control` list.
data_lda <- data_dtm %>%
  LDA(k = 16, control = list(seed = 1234))
```

```{r}
library(reshape2)

# Tidy the fitted model's "beta" matrix into a data frame.
ap_topics <- data_lda %>%
  tidy(matrix = "beta")

ap_topics
```

```{r}
# Keep, for every topic, the 7 terms with the largest beta, ordered by topic
# and then by decreasing beta.
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  slice_max(order_by = beta, n = 7) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# One bar-chart panel per topic. `reorder_within()` together with
# `scale_y_reordered()` orders the terms inside each panel by beta.
ap_gg <- ap_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(x = beta, y = term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

ap_gg

```

\

```{r}
# Spread the betas to one column per topic ("topic1", "topic2", ...), keep the
# terms whose beta exceeds 0.001 in topic 1 or topic 2, and add the log2 ratio
# of topic 2 to topic 1.
beta_wide <- pivot_wider(
  mutate(ap_topics, topic = paste0("topic", topic)),
  names_from = topic,
  values_from = beta
) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))

beta_wide
```

## Bigrams

codice da [Relationships between words: n-grams and correlations \| Text Mining with R (tidytextmining.com)](https://www.tidytextmining.com/ngrams.html)

```{r}
# Tokenise each title into two-word sequences (bigrams) and keep only the
# bigram column. `title` is tokenised directly: uniting a single column did
# nothing but rename it.
# NOTE(review): only `title` is used here, whereas the word cloud and the topic
# model use abstract + title; confirm this is intended.
bigrams <- data %>%
  unnest_tokens(bigram, title, token = "ngrams", n = 2) %>%
  select(bigram)

bigrams
```

```{r}
# Split each bigram into its two component words.
bigrams_separated <- separate(
  bigrams,
  bigram,
  into = c("word1", "word2"),
  sep = " "
)

# Drop bigrams where either word is a stop word or starts with a digit.
bigrams_filtered <- bigrams_separated %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !str_detect(word1, "^[0-9]"),
    !str_detect(word2, "^[0-9]")
  )

# Stem both words, then count each stemmed pair, most frequent first.
bigram_counts <- bigrams_filtered %>%
  mutate(
    word1 = wordStem(word1),
    word2 = wordStem(word2)
  ) %>%
  count(word1, word2, sort = TRUE)

bigram_counts
```

```{r}
library(igraph)
library(ggraph)

# Build a graph from the stemmed bigrams that occur at least 500 times.
bigram_graph <- graph_from_data_frame(filter(bigram_counts, n >= 500))

# Fix the RNG seed before the plot (and its layout) is built.
set.seed(2017)

# Network plot: edges, node points and node labels on an empty theme.
gg <- bigram_graph %>%
  ggraph(layout = "fr") + # "fr" or "graphopt"
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), hjust = 1, vjust = 1) +
  theme_void()

gg
```

```{r}

# igraph is attached again here (it is already loaded for the bigram graph).
library(igraph)
# NOTE(review): nothing from ggiraph is called in the visible code; confirm
# whether an interactive version of the plot was intended.
library(ggiraph)

# Display the bigram network plot again.
gg
```

## Using CPC

codice sunburst da: [Create Basic Sunburst Graphs with ggplot2 \| by Yahia El Gamal \| Optima . Blog \| Medium](https://medium.com/optima-blog/create-basic-sunburst-graphs-with-ggplot2-7d7484d92c61)

```{r}
library(tidyverse)

# Load the CPC code/description lookup table from its relative path.
cpc_descr <- "../../data/CPC_descriptions.csv" %>%
  read_csv()

# Show its first five rows.
head(cpc_descr, 5)
```

```{r}
# Show the first five rows of the main data.
head(data, 5)
```

```{r}
# Preview the 3-character prefix of each row's `ipc_classes`. The prefix is
# computed per row: indexing `ipc_classes[1]` would repeat the first row's
# prefix on every row.
data %>%
  mutate(tmp = substr(ipc_classes, 1, 3))
```

```{r}
# Attach CPC descriptions to each record: the lower-cased first four
# characters of `ipc_classes` form the key `code_4`, which is matched against
# the distinct (code_4, descr_1, descr_3, descr_4) rows of the lookup table.
# Records without a matching code are dropped by the inner join.
res <- data %>%
  mutate(code_4 = tolower(substr(ipc_classes, 1, 4))) %>%
  inner_join(
    distinct(select(cpc_descr, code_4, descr_1, descr_3, descr_4)),
    by = "code_4"
  )

head(res)
```

```{r}
# Collapse `res` to one row per (filename, descr_1, descr_3, descr_4)
# combination with its row count `n`, most frequent first.
res <- count(res, filename, descr_1, descr_3, descr_4, sort = TRUE)

res
```

```{r}
library(sunburstR)
# Replace "-" with "_" inside the three description levels so that it cannot
# be confused with the "-" separator used to join the levels below.
res <- res %>%
  mutate(across(
    c(descr_1, descr_3, descr_4),
    ~ gsub("-", "_", as.character(.x), fixed = TRUE)
  ))

# One row per "level1-level2-level3" path with the number of `res` rows that
# share it, most frequent first. `count(united)` returns an ungrouped table, so
# no leftover grouping is passed on to the plotting step.
dat <- res %>%
  unite(col = "united", descr_1, descr_3, descr_4, sep = "-") %>%
  count(united, sort = TRUE)

dat %>% head()
```

```{r}
library(d3r)
  

# Build the sunburst widget from the path/count table, with labels shown and
# the breadcrumb trail enabled, then display it.
sb3 <- dat %>%
  sund2b(
    showLabels = TRUE,
    breadcrumbs = sund2bBreadcrumb(enabled = TRUE)
  )

sb3
```
